/******************************************************************************/
/* Frank Fossen, Levent Neyse, and Carsten Schroeder                          */
/* Does Cognitive Reflection Relate to Preferences & Socio-Economic Outcomes? */
/* Journal of Political Economy Microeconomics (accepted 2024)                */
/******************************************************************************/

/******************************************************************************/
/* Data preparation                                                           */
/******************************************************************************/

/* Session setup: no paging of long output, variable names must be typed in
   full, memory is emptied, and a fresh plain-text log replaces any log that
   is still open. */
set more off
set varabbrev off

clear

capture log close
log using "${MY_RES_PATH}01_prepare_SOEPIS2020.log", replace text

#delimit;


/***************************/
/* Merge data source files */
/***************************/

/* Base file: person-level variables from SOEP-IS long (p.dta).
   Besides the identifiers pid, hid and syear, the following items are loaded:
     plh0204                                       willingness to take risk
     plh0253                                       patience
     plh0254                                       impulsiveness
     plh0192 plh0193 plh0194                       trust
     plh0206 plh0209 plh0211                       positive reciprocity
     pspa01 pspa02a pspa02b pspa02c pspa02d pspa02 financial assets
     plh0004                                       political orientation
     pli0098_v2                                    religiousness
     ple0008                                       health
     plh0182                                       life satisfaction
     pld0047                                       number of friends */
use
	pid hid syear
	plh0204
	plh0253
	plh0254
	plh0192 plh0193 plh0194
	plh0206 plh0209 plh0211
	pspa01 pspa02a pspa02b pspa02c pspa02d pspa02
	plh0004
	pli0098_v2
	ple0008
	plh0182
	pld0047
	using "${SOEPIS_IN_PATH}p.dta", clear;

/* Order the records by person and survey year. */
sort pid syear;


/* Add the generated person variables (pgen.dta), matched on year and person:
     pglabgro                   individual gross labor income
     pgstib                     civil servant
     pgsbil pgbbil01 pgbbil02   education */
merge 1:1 syear pid
	using "${SOEPIS_IN_PATH}pgen.dta",
	keepusing(pglabgro pgstib pgsbil pgbbil01 pgbbil02);

/* Records found only in pgen.dta are discarded; the merge flag is removed. */
keep if _merge != 2;
drop _merge;

	
/* Add the cognitive ability test results (cognit.dta), matched on year and
   person: f025r = correct animal names, f099r90 = correct numbers within
   90 seconds. */
merge 1:1 syear pid
	using "${SOEPIS_IN_PATH}cognit.dta",
	keepusing(f025r f099r90);

/* Records found only in cognit.dta are discarded; the merge flag is removed. */
keep if _merge != 2;
drop _merge;


/* Add the innovation-module items (inno.dta), matched on year and person.
   The merge indicator is stored as mergeInno. Records found only in inno.dta
   are NOT removed here: they are dropped later, after the averages have
   been calculated.
     isach*     CRT module
     ifl02a1    altruism
     donat_a3   donations */
merge 1:1 syear pid
	using "${SOEPIS_IN_PATH}inno.dta",
	gen(mergeInno)
	keepusing(
		isach01_1 isach01_2 isach02 isach03 isachrichtig isachk03
		isach01_wn isach02_wn isach03_wn isachk01 isachk01_wn isachk02
		isachk02_wn
		ifl02a1
		donat_a3
	);


/* Add year of birth (gebjahr) and gender (sex) from ppfad.dta. The using
   file is matched on pid alone, so one ppfad record serves all survey years
   of a person. */
merge m:1 pid
	using "${SOEPIS_IN_PATH}ppfad.dta",
	keepusing(gebjahr sex);

/* Records found only in ppfad.dta are discarded; the merge flag is removed. */
keep if _merge != 2;
drop _merge;


/* Add the household saving items hspar2a and hspar2b from h.dta, matched on
   survey year and household, so all members of a household receive the same
   values. */
merge m:1 syear hid
	using "${SOEPIS_IN_PATH}h.dta",
	keepusing(hspar2a hspar2b);

/* Records found only in h.dta are discarded; the merge flag is removed. */
keep if _merge != 2;
drop _merge;


/* Add the time stamps ZXZ1ISACH01-ZXZ1ISACH03. They are not included in the
   standard SOEP-IS distribution; we obtained the variables directly from the
   SOEP group at DIW. The time stamp file is keyed by pnrfest and jahr, so
   the identifiers are renamed for the merge and renamed back afterwards. */
rename syear jahr;
rename pid pnrfest;

merge 1:1 pnrfest jahr
	using "${MY_DATA_PATH}Inno20_Zeitmarken_ISACH.dta",
	keepusing(ZXZ1ISACH01 ZXZ1ISACH02 ZXZ1ISACH03);

/* Records found only in the time stamp file are discarded. */
keep if _merge != 2;
drop _merge;

rename jahr syear;
rename pnrfest pid;


/* Add the current-student variable plg0014 from the 2020 SOEP-IS.
   The variable in the official SOEP-IS distribution is incompletely coded;
   we obtained the corrected variable from Martin Gerike, DIW SOEP.
   The file is matched on pid alone, so the value is attached to every
   survey year of a person. */
merge m:1 pid
	using "${MY_DATA_PATH}paus_2020.dta",
	keepusing(plg0014)
	gen(mergeStudents);

/* Records found only in paus_2020.dta are discarded; the flag is removed. */
keep if mergeStudents != 2;
drop mergeStudents;


#delimit cr;


/**********************/
/* Generate variables */
/**********************/


/* Dependent variables: */
/************************/


/* Single-item measures:
     riskpref   willingness to take risks   (plh0204)
     patience   patience                    (plh0253)
     impulsive  impulsiveness               (plh0254)
     altruism   altruism                    (ifl02a1)
   All four follow the same recipe: copy the raw item, set negative codes to
   missing, then tabulate the raw item, the cleaned variable, and the cleaned
   variable by survey year. */

local itemScales  "riskpref patience impulsive altruism"
local itemSources "plh0204 plh0253 plh0254 ifl02a1"

forvalues k = 1/4 {
	local scale : word `k' of `itemScales'
	local item  : word `k' of `itemSources'

	gen `scale' = `item'
	replace `scale' = . if `scale'<0

	tab `item', missing
	tab `scale', mis
	tab syear `scale', mis
}


/* Trust */
/* Mean of three items, where plh0192 enters reversed as (5 - plh0192).
   The index is only computed when none of the three items carries a negative
   code. A missing item satisfies ">=0" in Stata, but then the sum itself is
   missing, so the index is missing in that case as well. */
gen trust = ((5-plh0192) + plh0193 + plh0194) / 3 ///
	if plh0192>=0 & plh0193>=0 & plh0194>=0

foreach item of varlist plh0192 plh0193 plh0194 {
	tab `item', mis
}
tab trust, missing
tab syear trust, missing


/* Positive reciprocity */

/* Unweighted mean of plh0206, plh0209 and plh0211. The index is only
   computed when none of the items carries a negative code; if an item is
   missing, the sum and therefore the index is missing too. */
gen reciprocity = (plh0206 + plh0209 + plh0211) / 3 ///
	if plh0206>=0 & plh0209>=0 & plh0211>=0

foreach item of varlist plh0206 plh0209 plh0211 {
	tab `item', mis
}
tab reciprocity, missing
tab syear reciprocity, missing


/* Monthly gross income */

/* Recode of pglabgro in one step:
     -2 (does not apply)          -> 0, i.e. no income
     -1, or any code <= -3        -> missing (no answer or answer improbable)
     everything else              -> taken over unchanged */
gen grossinc = cond(pglabgro==-2, 0, cond(pglabgro==-1 | pglabgro<=-3, ., pglabgro))

tab pglabgro if pglabgro<0, mis
sum pglabgro grossinc
/* bysort leaves the data sorted by syear */
bysort syear: sum pglabgro grossinc


/* Financial assets */

/* Reported amount (pspa02). Negative values are response codes, not amounts,
   so every negative code is set to missing. Previously only -2 and -5 were
   handled; any other negative code would have stayed in the variable as a
   negative "amount" and would also have been skipped by the bracket
   imputation below, which only fills missing values.
     -2  Does not apply. Can mean "no answer" or "no financial assets".
     -5  Question not contained in questionnaire. */
gen finassets = pspa02
replace finassets = . if pspa02<0

replace finassets = 0 if pspa01==2 /* No financial assets. This is included in the code "does not apply" in pspa02. */

sum finassets


/* Bracket imputation: a respondent without a usable amount receives the mean
   of the amounts reported inside the bracket indicated by the follow-up
   items pspa02a-pspa02d. Each -summarize- provides r(mean) for the -replace-
   directly after it.
   NOTE(review): the 1/2 coding of pspa02a-pspa02d is read off the conditions
   used here (1 appears to mean "above the threshold", 2 "not above") -
   confirm against the codebook. Amounts exactly equal to 5000, 20000 or
   100000 belong to neither neighbouring reference group. */

sum finassets if finassets>0 & finassets<5000 /* Only those who have positive financial assets are asked whether their assets are less than euro 5000. */
replace finassets=r(mean) if finassets==. & pspa02c==2

sum finassets if finassets>5000 & finassets<20000
replace finassets=r(mean) if finassets==. & pspa02c==1 & pspa02a==2

sum finassets if finassets>20000 & finassets<100000
replace finassets=r(mean) if finassets==. & pspa02a==1 & pspa02b==2

sum finassets if finassets>100000 & finassets<1000000
replace finassets=r(mean) if finassets==. & pspa02b==1 & pspa02d==2

/* Top bracket: fixed value instead of a bracket mean */
replace finassets=1000000 if finassets==. & pspa02d==1

sum finassets
/* bysort leaves the data sorted by syear */
bys syear: sum finassets


/* Political orientation */

/* plh0004 is taken over only when it lies in [0, 12); negative codes, values
   of 12 and above, and missing values all end up as missing. */
gen polRight = plh0004 if plh0004>=0 & plh0004<12

tab plh0004, missing
tab polRight, mis
tab syear polRight, mis


/* Religiousness and health */

/* Both variables are the raw item subtracted from 6:
     religion = 6 - pli0098_v2
     health   = 6 - ple0008
   Negative codes of the raw item are set to missing. Afterwards the raw
   item, the new variable, and the new variable by year are tabulated. */

local revScales  "religion health"
local revSources "pli0098_v2 ple0008"

forvalues r = 1/2 {
	local revScale : word `r' of `revScales'
	local revItem  : word `r' of `revSources'

	gen `revScale' = 6-`revItem'
	replace `revScale' = . if `revItem'<0

	tab `revItem', missing
	tab `revScale', mis
	tab syear `revScale', mis
}


/* Life satisfaction */

/* plh0182 is copied unless it carries a negative code. A missing raw value
   satisfies ">=0" in Stata and is simply copied as missing. */
gen lifeSatis = plh0182 if plh0182>=0

tab plh0182, missing
tab lifeSatis, mis
tab syear lifeSatis, mis


/* Civil servant */

/* civilServ = 1 when pgstib lies in 610-640, 0 for every other non-negative
   code. The indicator is missing when pgstib carries a negative code or is
   itself missing (e.g. no matching pgen record): "pgstib<0" is false for a
   missing value in Stata, so without the explicit missing() check such
   records would be counted as "not a civil servant". */
gen civilServ = inrange(pgstib, 610, 640)
replace civilServ = . if pgstib<0 | missing(pgstib)

tab syear civilServ, mis


/* Savings */

/* Household saving is the sum of two amounts: hspar2a (stored as
   assetFormation) and hspar2b (stored as precautSav).
   The by-year summaries use -bysort- rather than a bare -by-: -by- requires
   the data to be sorted by syear already and aborts with "not sorted"
   otherwise, which made this section depend on a sort performed as a side
   effect further up in the file. */

sum hspar2a hspar2b
bysort syear: sum hspar2a hspar2b

tab hspar2a if hspar2a<0, missing
tab hspar2b if hspar2b<0, missing

tab syear hspar2a if hspar2a<0, missing
tab syear hspar2b if hspar2b<0, missing

gen assetFormation = hspar2a
replace assetFormation = . if hspar2a<0 & hspar2a!=-2 /* Not in questionnaire, answer improbable, no answer */
replace assetFormation = 0 if hspar2a==-2 /* Does not apply means 0 saving */

gen precautSav = hspar2b
replace precautSav = . if hspar2b<0 & hspar2b!=-2 /* Not in questionnaire, answer improbable, no answer */
replace precautSav = 0 if hspar2b==-2 /* Does not apply means 0 saving */

/* Missing as soon as one of the two components is missing */
gen saving = assetFormation + precautSav

sum saving assetFormation precautSav
bysort syear: sum saving assetFormation precautSav


/* Donations */

tab donat_a3, mis
tab donat_a3 if donat_a3<0, mis
tab syear donat_a3 if donat_a3<0, mis

/* -2 (does not apply) means no donations and becomes 0. Every remaining
   negative value is a response code rather than an amount and becomes
   missing. Previously only -1 and codes <= -5 were set to missing, so a code
   of -3 or -4 would have remained in the variable as a negative donation. */
gen donations = donat_a3
replace donations=0 if donations==-2 /* Does not apply means no donations */
replace donations=. if donations<0 /* No answer, not included, or any other negative code */

sum donat_a3 donations
bys syear: sum donat_a3 donations


/* Number of close friends */

tab pld0047, mis
tab pld0047 if pld0047<0, mis
tab syear pld0047 if pld0047<0, mis

/* Every negative value of pld0047 is a response code, not a count, and is
   set to missing. Previously only -1 and codes <= -5 were handled, so a code
   of -2, -3 or -4 would have remained in the variable as a negative number
   of friends.
   NOTE(review): if -2 (does not apply) occurs and is meant to express "no
   close friends", it should be recoded to 0 instead - check the first
   tabulation above. */
gen friends = pld0047
replace friends=. if friends<0 /* No answer, not included, or any other negative code */

sum pld0047 friends
bys syear: sum pld0047 friends


/* Age */

/* Age in the survey year. A negative gebjahr is a code rather than a year
   and yields a missing age (1 observation). A missing gebjahr satisfies
   ">=0" in Stata, but the difference is then missing anyway. */
gen age = syear-gebjahr if gebjahr>=0

/* Squared age */
gen agesq = age*age


/* Gender */

/* female = 1 when sex==2, 0 for every other non-negative code. The indicator
   is missing when sex carries a negative code or is itself missing:
   "sex<0" is false for a missing value in Stata, so without the explicit
   missing() check a person without gender information would be coded as
   female==0. */
gen female = (sex==2)
replace female=. if sex<0 | missing(sex)


/* Now University student */

tab syear plg0014, mis
/* Note that the coding is different from what's described in paneldata.org */

/* student2020 = 1 when plg0014==2, otherwise 0; the codes -1 and -3
   (no answer or implausible) are set to missing.
   NOTE(review): a record with a missing plg0014 (for example, a person
   without a match in the 2020 student file) is coded 0 here - confirm that
   this is intended. */
gen student2020 = (plg0014==2)
replace student2020 = . if inlist(plg0014, -1, -3)

tab syear student2020, mis


/* Education variables */
/***********************/

/* Raw distributions, with and without value labels */
foreach eduRaw of varlist pgsbil pgbbil01 pgbbil02 {
	tab `eduRaw', mis
	tab `eduRaw', mis nolabel
}


/* School degree: Fachhochschulreife or Abitur (pgsbil 3 or 4) */
gen highschool = inlist(pgsbil, 3, 4)
replace highschool = . if pgsbil==-1
tab pgsbil highschool, mis

/* Apprenticeship: pgbbil01 == 1.
   -2 = does not apply: means no apprenticeship, so it stays 0. */
gen apprenticeship = (pgbbil01==1)
replace apprenticeship = . if pgbbil01==-1

/* Degree coded 2-7 in pgbbil01.
   -2 = does not apply: means no such degree, so it stays 0. */
gen highertechncol = inlist(pgbbil01, 2, 3, 4, 5, 6, 7)
replace highertechncol = . if pgbbil01==-1
tab pgbbil01 apprenticeship, mis
tab pgbbil01 highertechncol, mis

/* College degree: pgbbil02 coded 1-6.
   -2 = does not apply: means no college degree, so it stays 0. */
gen university = inlist(pgbbil02, 1, 2, 3, 4, 5, 6)
replace university = . if pgbbil02==-1
tab pgbbil02 university, mis

/* NOTE(review): in all four dummies only the code -1 becomes missing. Any
   other negative code and a missing raw value are coded 0 - confirm that
   this is intended. */

tab highschool university

foreach eduDummy of varlist highschool apprenticeship highertechncol university {
	tab syear `eduDummy', mis
}


/* Cognitive ability */
/*********************/

/* wordsCorr: f025r (correct animal names), available in 2014 only.
   Negative codes (not included or does not apply) become missing. */
tab f025r syear, mis
gen wordsCorr = f025r if f025r>=0

/* numbersCorr: f099r90 (correct numbers within 90 seconds), available in
   2014 and a subsample of 2018. Negative codes (does not apply) become
   missing. */
tab f099r90 syear , mis
gen numbersCorr = f099r90 if f099r90>=0


/*************************/
/* Prepare CRT variables */
/*************************/

#delimit;

/* Ball */
/* The answer is recorded in two parts, Euro (isach01_1) and Cents
   (isach01_2). Both parts are combined into a single amount in cents;
   the answer counts as correct when that amount equals 5. */

tab isach01_1, mis;
tab isach01_2, mis;
tab isach01_wn, mis;
tab isach01_wn, mis nolabel;

/* Negative codes become missing.
   The same individuals have negative values or missings in both Euro and
   Cents. */
gen ballEuro  = isach01_1 if isach01_1>=0;
gen ballCents = isach01_2 if isach01_2>=0;

gen ballCentsCombi = ballEuro*100 + ballCents;

/* Missing when no amount is available, except that a "doesn't know" answer
   (isach01_wn==1) always counts as incorrect. */
gen ballCorrect = (ballCentsCombi==5) if ballCentsCombi!=.;
replace ballCorrect=0 if isach01_wn==1;

tab ballCentsCombi ballCorrect, mis;
tab isach01_wn ballCorrect, mis;


/* Machine */
/* Answer in isach02; negative codes become missing. The answer counts as
   correct when it equals 5. */

tab isach02, mis;
tab isach02_wn, mis;
tab isach02_wn, mis nolabel;

gen machine = isach02 if isach02>=0;

/* Missing when no answer is available, except that isach02_wn==1 always
   counts as incorrect. */
gen machineCorrect = (machine==5) if machine!=.;
replace machineCorrect = 0 if isach02_wn==1;


/* Lilies */
/* Answer in isach03; negative codes become missing. The answer counts as
   correct when it equals 47. */

tab isach03, mis;
tab isach03_wn, mis;
tab isach03_wn, mis nolabel;

gen lilies = isach03 if isach03>=0;

/* Missing when no answer is available, except that isach03_wn==1 always
   counts as incorrect. */
gen liliesCorrect = (lilies==47) if lilies!=.;
replace liliesCorrect = 0 if isach03_wn==1;

tab isach03 liliesCorrect, mis;
tab isach03_wn liliesCorrect, mis;

/* Cross-check of the three correctness flags against isachrichtig */
tab isachrichtig, mis;
tab isachrichtig ballCorrect, mis;
tab isachrichtig machineCorrect, mis;
tab isachrichtig liliesCorrect, mis;

/* CRT score: number of correct answers, kept for survey year 2020 only.
   It is missing as soon as one of the three flags is missing. */
gen correctCount = ballCorrect + machineCorrect + liliesCorrect if syear==2020;


/* The mean score is stored in the global correctCount_others. At this point
   the records that exist only in the inno file are still in the data. */
di "Average CRT score:";
sum correctCount;
global correctCount_others = r(mean);

tab isachrichtig correctCount, mis;

tab isachk03, mis;
tab isachk03, mis nolabel;

/* questionsKnown = 1 when isachk03==1, otherwise 0; it is missing when
   isachk03 is -1 or missing. */
gen questionsKnown = (isachk03==1) if isachk03!=-1 & isachk03!=.;

tab isachk03 questionsKnown if syear==2020, mis;
tab questionsKnown correctCount if syear==2020, mis;



/* Time stamp data */
/*******************/

/* guess_self: isachk01 with negative codes set to missing. A missing raw
   value satisfies ">=0" in Stata and is copied as missing. */
gen guess_self = isachk01 if isachk01>=0;


#delimit cr

/* Numeric copies of the three time stamps:
   ZXZ1ISACH01 -> timeBall, ZXZ1ISACH02 -> timeMachine,
   ZXZ1ISACH03 -> timeLilies */
local stampNo = 0
foreach task in Ball Machine Lilies {
	local ++stampNo
	destring ZXZ1ISACH0`stampNo', gen(time`task')
}

/* Total time over the three questions; missing if any part is missing */
gen timeCRT = timeBall+timeMachine+timeLilies

order ZXZ1ISACH01 ZXZ1ISACH02 ZXZ1ISACH03 timeBall timeMachine timeLilies timeCRT


bysort guess_self: sum timeCRT if questionsKnown!=1

sum timeCRT, det

/* The thresholds come from the distribution of timeCRT among records with
   questionsKnown!=1 (this includes records where questionsKnown is missing).
   The percentiles are stored right after -summarize- so that the flags below
   do not depend on r() staying untouched. */
sum timeCRT if questionsKnown!=1, det
scalar crtTimeP5  = r(p5)
scalar crtTimeP25 = r(p25)

/* fastCRT: total time strictly below the 5th percentile */
gen fastCRT = (timeCRT<scalar(crtTimeP5)) if timeCRT!=.

/* unmotivatedCRT: total time strictly below the 25th percentile combined
   with guess_self<=1; missing when either input is missing */
gen unmotivatedCRT = (timeCRT<scalar(crtTimeP25) & guess_self<=1) if timeCRT!=. & guess_self!=.

scalar drop crtTimeP5 crtTimeP25

tab fastCRT unmotivatedCRT, mis
tab guess_self unmotivatedCRT, mis


/* The average CRT score and the time distribution have been calculated, so
   the records that exist only in the inno file (mergeInno==2) can now be
   removed together with the merge indicator. */
keep if mergeInno != 2
drop mergeInno


/**********************/
/* Forward imputation */
/**********************/

/* For each listed variable a copy <name>_fwd is created in which a missing
   value is filled with the same person's value from the preceding record
   (records ordered by survey year). Filled values are carried on, so a value
   propagates across several consecutive missing years.
   The list holds variables not contained in the 2020 SOEP-IS, plus political
   orientation, because only 2/9 of the 2020 SOEP-IS received this question. */
local varnames "patience impulsive altruism trust finassets polRight donations wordsCorr numbersCorr"

sort pid syear

foreach v of local varnames {
	gen `v'_fwd = `v'
	/* Within a person, [_n-1] is the previous record; for a person's first
	   record it is missing, so nothing changes there. */
	by pid (syear): replace `v'_fwd = `v'_fwd[_n-1] if `v'_fwd==.
	sum `v' `v'_fwd
}


/* The end: shrink the storage types and write the prepared data set */

compress
save "${MY_DATA_PATH}SOEPIS2020prep.dta", replace
